#' ---
#' title: Application 1: Candidate Merge
#' author: Joe Ornstein
#' date: 2025-02-27
#' version: 0.3
#' ---

# Print the banner for this application; the asterisks are part of the text
# that is written to the console/log.
cat("**Application 1: Linking Candidate Names**", "\n\n", sep = "")

# Remove every non-hidden object from the current environment.
# NOTE(review): when this script is source()d into the global environment this
# also deletes the caller's objects, and it does not detach packages or reset
# options -- confirm this is intended.
rm(list = ls())

library(tidyverse)

## Load Merged Datasets -------------------------------

# fuzzylink
# The model name and the matching formula together identify the directory that
# holds the fuzzylink output. Both are kept as top-level objects under their
# original names in case other code refers to them.
model <- "gpt-4o-2024-11-20"
fmla <- match ~ sim + jw

# Build the results directory once and reuse it for both files stored there.
# deparse1() always returns a single string; deparse() splits long expressions
# into several strings, which would turn the path into a character vector and
# make load() fail for a longer formula. For the formula above the resulting
# path is identical to the one produced with deparse().
fuzzylink_dir <- file.path("data/candidate-merge", model, deparse1(fmla))

load(file = file.path(fuzzylink_dir, "l2_fuzzylink.RData"))

# fastLink
# NOTE(review): load() silently overwrites objects that share a name with
# objects already in the workspace -- confirm this file does not clash with
# the fuzzylink objects loaded just above.
load("data/candidate-merge/l2_fastLink.RData")

# load hand-labeled pairs
load("data/candidate-merge/hand_labels.RData")
load(file = file.path(fuzzylink_dir, "recall.RData"))


## Precision and Recall ---------------------------

# fuzzylink
# Distinct (A, B, block) links, keeping only rows where B is present.
linked_pairs <- df |>
  select(A, B, block) |>
  unique() |>
  filter(!is.na(B))

# A link counts as a true match when A and B are identical strings or when the
# joined hand label is 'Yes'. The join uses dplyr's default of matching on all
# columns shared with `hand_labels`.
# NOTE(review): a non-identical pair with no hand label gets NA here, and that
# NA propagates into the precision and recall printed below -- confirm every
# non-exact pair has been labeled.
fuzzylink_matches <- linked_pairs |>
  mutate(exact_match = if_else(A == B, "Yes", "No")) |>
  left_join(hand_labels) |>
  mutate(true_match = as.numeric(exact_match == "Yes" | hand_label == "Yes"))

# Precision: share of returned links that are true matches.
fuzzylink_precision_est <- mean(fuzzylink_matches$true_match)
cat("Precision (fuzzylink):", fuzzylink_precision_est, "\n")

# recall = true_positives / (true_positives + false_negatives)
# NOTE(review): `recall$match_in_L2` presumably flags records whose true match
# exists in L2 but was not returned -- verify against the script that builds it.
true_positives <- sum(fuzzylink_matches$true_match)
false_negatives <- sum(recall$match_in_L2)
fuzzylink_recall_est <- true_positives / (true_positives + false_negatives)

cat("Recall (fuzzylink):", fuzzylink_recall_est, "\n")

# fastLink
# This file is expected to provide `fastLink_precision`, whose `validated`
# column is summarised below.
# NOTE(review): presumably 0/1 (or logical) validation flags -- confirm.
load("data/candidate-merge/fastLink_precision_validated.RData")
fastlink_validated <- fastLink_precision$validated
cat("Precision (fastLink):", mean(fastlink_validated), "\n")

# recall: matches found / total matches (same denominator as before, i.e. the
# fuzzylink true positives plus false negatives computed earlier in the script)
total_true_matches <- true_positives + false_negatives
cat("Recall (fastLink):", sum(fastlink_validated) / total_true_matches, "\n")


## Figure A4

## Figure

